Looking at the poly-A tails of human RNA
In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
In [2]:
from itertools import groupby
def rna_adenylation(path):
items = []
for (header, seq) in fasta_iter(path):
index = -1
while(seq[index] == 'a'):
index = index - 1
a_tail_len = index * -1 - 1
items.append((header, [len(seq), a_tail_len]))
return pd.DataFrame.from_items(items, orient="index", columns=['rna_len', 'a_tail_len'])
# https://www.biostars.org/p/710/
def fasta_iter(fasta_name):
"""
given a fasta file. yield tuples of header, sequence
"""
fh = open(fasta_name)
# ditch the boolean (x[0]) and just keep the header or sequence since
# we know they alternate.
faiter = (x[1] for x in groupby(fh, lambda line: line[0] == ">"))
for header in faiter:
# drop the ">"
header = header.next()[1:].strip()
# join all sequence lines to one.
seq = "".join(s.strip() for s in faiter.next())
yield header, seq
In [3]:
df = rna_adenylation("data/mrna.fa")
df.head(10)
Out[3]:
In [4]:
df.describe()
Out[4]:
In [9]:
%matplotlib notebook
masks = [(df.a_tail_len <= 3),
(df.a_tail_len > 3) & (df.a_tail_len <= 50),
(df.a_tail_len > 50) & (df.a_tail_len <= 100),
(df.a_tail_len > 100) & (df.a_tail_len <= 150),
(df.a_tail_len > 150)]
plt.figure(figsize=(12,9))
for idx, mask in enumerate(masks):
plt.subplot(len(masks), 1, idx+1)
df[mask]["a_tail_len"].plot.hist(bins=50, alpha=0.5, color='k')
plt.show()
Most of the RNA didn't have poly-A tails
In [6]:
df[(df.a_tail_len == 0)].describe()
Out[6]:
In [7]:
%matplotlib notebook
# Remove a few outliers and put them into a table instead
df[(df.rna_len < 20000)].plot.scatter(x='rna_len', y='a_tail_len', figsize=(12,9))
plt.show()
In [8]:
# The outliers
df[(df.rna_len >= 20000)]
Out[8]: